Importing the libraries
In [ ]:
# Importing the libraries
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
from pyspark.sql.functions import length
from pyspark.sql import functions as F
Exploratory Data Analysis
Read Data from Spark
Data Cleaning
In [ ]:
# Load the filtered Reddit submissions and derive the features used by the
# exploratory plots: post length, calendar/time parts and a has-media flag.
datastore = 'azureml://datastores/workspaceblobstore/paths/'
submissions_path = 'filtered-submissions'
submissions_df = spark.read.parquet(f"{datastore}{submissions_path}")

# Keep only the columns needed downstream.
df = submissions_df.select(
    "subreddit", "author", "title", "selftext",
    "created_utc", "num_comments", "score",
    "over_18", "media", "pinned", "locked",
    "disable_comments", "domain", "hidden",
    "distinguished", "hide_score",
)

# Post length = characters in the title plus characters in the body.
# length() yields NULL for a NULL input and NULL + n is NULL, so each part is
# coalesced to 0; otherwise a post with a missing title or selftext would end
# up with no post_length at all.
df = df.withColumn(
    'post_length',
    F.coalesce(length(df.title), F.lit(0))
    + F.coalesce(length(df.selftext), F.lit(0)),
)

# NOTE(review): created_utc is presumably epoch seconds or a timestamp string;
# confirm that to_timestamp parses the stored type as intended.
df = df.withColumn('created_utc', F.to_timestamp('created_utc'))

# Extract time-based features from the post timestamp.
df = df.withColumn('hour_of_day', F.hour('created_utc'))
df = df.withColumn('day_of_week', F.dayofweek('created_utc'))  # 1 (Sunday) to 7 (Saturday)

# Map the numeric day of the week to its English name. There is no ELSE
# branch, so a NULL day_of_week stays NULL in day_of_week_str.
df = df.withColumn('day_of_week_str', F.expr("""
    CASE day_of_week
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END
"""))
df = df.withColumn('day_of_month', F.dayofmonth('created_utc'))
df = df.withColumn('month', F.month('created_utc'))
df = df.withColumn('year', F.year('created_utc'))

# Flag posts that carry a media payload, then drop the raw columns that are
# not used after this point.
df = df.withColumn('has_media', F.col('media').isNotNull())
df = df.drop("media", "disable_comments", "distinguished")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 108, 9, Finished, Available)
In [ ]:
# Output folders for saved figures and exported tables. Both paths are
# relative, so they resolve against the current working directory.
PLOT_DIR, CSV_DIR = (
    os.path.join("Users/sk2224/fall-2023-reddit-project-team-34/data", leaf)
    for leaf in ("plots", "csv")
)
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 105, 8, Finished, Available)
Scatterplot of engagement metrics for Reddit posts
In [ ]:
# Engagement scatter plot: score on the x-axis, post length on the y-axis,
# one marker per post, coloured by subreddit and sized by comment count.
df_plotly = df.select(["subreddit", "num_comments", "score", "has_media", "post_length"])
df_plotly = df_plotly.filter(df_plotly.subreddit.isin('movies', 'anime', 'television'))

# Collect the filtered rows to the driver as a pandas DataFrame for plotly.
df_plotly_pd = df_plotly.toPandas()

# One fixed colour per subreddit.
color_map = {
    'movies': '#FF4301',
    'anime': '#ff9200',
    'television': '#ffe100',
}

# Create the scatter plot with custom colors.
fig = px.scatter(
    df_plotly_pd,
    y='post_length',
    x='score',
    color='subreddit',
    color_discrete_map=color_map,  # Use the custom color map
    size='num_comments',
    labels={
        'num_comments': 'Number of Comments',
        'score': 'Score',
        'subreddit': 'Subreddit',
        'post_length': 'Post Length',
    },
    title='Engagement Dynamics of Reddit Posts Across Entertainment Subreddits',
)

# White background, and fixed axis windows so the view covers scores
# 0-50000 and post lengths 0-8000.
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.update_xaxes(range=[0, 50000])
fig.update_yaxes(range=[0, 8000])

# Show the plot.
fig.show()

# Make sure the output folder exists before saving; writing the HTML file
# fails with FileNotFoundError when the parent directory is missing.
os.makedirs(PLOT_DIR, exist_ok=True)
fig.write_html(f"{PLOT_DIR}/engagement_eda.html")
StatementMeta(9d2667d4-d95f-4c64-adf7-bfab734cf5c0, 48, 9, Finished, Available)